library(here) # A Simpler Way to Find Your Files, CRAN v1.0.1
library(tidyverse) # Easily Install and Load the 'Tidyverse', CRAN v1.3.0
library(qiime2R) # Import qiime2 artifacts to R, [github::jbisanz/qiime2R] v0.99.35
library(phyloseq) # Handling and analysis of high-throughput microbiome census data, Bioconductor v1.34.0
library(microbiome) # Microbiome Analytics, Bioconductor v1.12.0
library(picante) # Integrating Phylogenies and Ecology, CRAN v1.8.2
library(MicrobeR) # Handy functions for microbiome analysis in R, [github::jbisanz/MicrobeR] v0.3.2
library(mixOmics) # Omics Data Integration, Bioconductor v6.14.0
library(RUVSeq) # Remove Unwanted Variation from RNA-Seq Data, Bioconductor v1.24.0
library(sva) # Surrogate Variable Analysis, Bioconductor v3.38.0
library(cowplot) # Streamlined Plot Theme and Plot Annotations for 'ggplot2', CRAN v1.1.1
library(ggpubr) # 'ggplot2' Based Publication Ready Plots, CRAN v0.4.0
library(ggh4x) # Hacks for 'ggplot2', CRAN v0.1.2.1
library(plotly) # Create Interactive Web Graphics via 'plotly.js', CRAN v4.9.3
library(RColorBrewer) # ColorBrewer Palettes, CRAN v1.1-2
library(vegan) # Community Ecology Package, CRAN v2.5-7
# metadata
mtd <- read_tsv(here("data/metadata.tsv"), comment = "#q2") %>%
rename(SampleID = "#SampleID") %>%
column_to_rownames("SampleID") %>%
mutate(PCRBatch = factor(PCRBatch),
Diet = factor(Diet, levels = c("REF", "IM")),
Compartment = factor(Compartment, levels = c("PI", "DI")),
SampleOrigin = factor(
SampleOrigin,
levels = c("Feed", "Water", "Digesta", "Mucosa",
"Mock", "DNA-extraction", "Amplicon-PCR")),
SampleType = factor(
SampleType,
levels = c("REF-PID", "REF-PIM", "REF-DID", "REF-DIM",
"IM-PID", "IM-PIM", "IM-DID", "IM-DIM",
"Feed", "Water", "Extraction-blank", "PCR-blank", "Mock")))
# feature table
otu <- read_qza(here("data/intermediate/qiime2/97otu/table-filtered-97otu-sepp-inserted.qza")) %>%
pluck("data") %>%
as("matrix")
# taxonomy
txnm <- read_qza(here("data/intermediate/qiime2/asv/taxonomy-silva132.qza"))
txnm <- txnm$data %>%
as.data.frame() %>%
mutate(Taxon = gsub("D_0", "k", Taxon), Taxon = gsub("D_1", "p", Taxon),
Taxon = gsub("D_2", "c", Taxon), Taxon = gsub("D_3", "o", Taxon),
Taxon = gsub("D_4", "f", Taxon), Taxon = gsub("D_5", "g", Taxon),
Taxon = gsub("D_6", "s", Taxon)) %>%
separate(Taxon, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) %>%
column_to_rownames("Feature.ID") %>%
select(-Confidence)
# phylogeny
tree <- read_qza(here("data/intermediate/qiime2/97otu/insertion-tree-97otu.qza"))
# assemble a phyloseq object
ps <- phyloseq(sample_data(mtd),
otu_table(otu, taxa_are_rows = TRUE),
tax_table(as.matrix(txnm)),
phy_tree(tree$data))
# change OTU names
indx <- formatC(1:ntaxa(ps), width = nchar(ntaxa(ps)), format = "d", flag = "0")
taxa_names(ps) <- paste0("OTU", indx)
# filter data
ps <- subset_samples(ps, !SampleType %in% c("Extraction-blank", "PCR-blank", "Mock")) # remove control samples
# extract metadata, otu table, taxonomy and phylogeny from phyloseq
mtd <- data.frame(sample_data(ps), check.names = FALSE)
otu <- as.data.frame(otu_table(ps)) %>% t()
txnm <- tax_table(ps) %>% as("matrix") %>% as.data.frame()
tree <- phy_tree(ps)
# Centered log-ratio transformation
otu_clr <- logratio.transfo(otu, logratio = "CLR", offset = 1 ) %>% as("matrix")
# PCA
pca_before <- pca(otu_clr, ncomp = 3)
# PCA plot
pcaPlot_before <- cbind(mtd, pca_before$variates$X) %>%
ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
geom_point(size = 2) +
geom_line(aes(group = SampleName), color = "black") + # connect paired technical replicates by line
labs(title = "Before batch correction", color = "Sample type", shape = "PCR batch",
x = paste0("PC1: ", round(100 * pca_before$explained_variance[1], 2), "%"),
y = paste0("PC2: ", round(100 * pca_before$explained_variance[2], 2), "%")) +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
theme_bw()
pcaPlot_before
From the PCA plot, we can tell that technical replicate samples are far apart. Mucosa samples from fish fed the REF diet form 2 clusters based on PCR batches. There’s a strong PCR batch effect.
RUVSeq removes unwanted variation in RNASeq data using replicate samples and negative control genes. Here we use it for correcting batch effects in microbiome data. The two main parameters of RUVSeq are the number of factors of unwanted variation, k, and the set of negative control variables genes, or OTUs in our case. The choice of the parameter k is not easy and is data-dependent. Empirically, a few (2-3) factors are enough to capture the unwanted variation. In noisy data, K can be increased to 5 or 10. Note that if k is too high, the RUVSeq over-corrects for unwanted variation and ends up removing (almost) all the biological variability within the conditions. The choice of negative control variables is also somewhat data-dependent. However, RUVSeq is robust to the choice of negative control variables. One is recommend to perform extensive exploratory data analysis, comparing different values of k and sets of negative control variables.
# calculate coefficient of variation (cv) for each otu
cv <- t(otu) %>%
as.data.frame() %>%
rownames_to_column("featureID") %>%
mutate(sd = apply(.[, names(.) != "featureID"], 1, sd),
mean = apply(.[, names(.) != "featureID"], 1, mean),
cv = sd / mean) %>%
arrange(cv)
# proportions of otus used as negative controls
prp <- c(0.1, 0.2, 0.5, 1)
# number of unwanted variation factors
k <- c(1, 2, 3, 4, 5, 10)
# get combinations of k and prp for parameter tuning
prmt <- expand.grid(prp = prp, k = k)
# sample replicate
SampleType <- mtd$SampleType
names(SampleType) <- rownames(mtd)
rpl <- makeGroups(SampleType)
# batch effect correction
ruv <- map2(
prmt$prp,
prmt$k,
function(x, y)
{
# get otus showing least variance at defined proportions
min_cv <- cv[1:round(nrow(cv) * x), ] %>%
mutate(featureID = paste0("^", featureID, "$")) # exact math of otu ids
# subset otus used as negative controls
nc <- grepl(paste(min_cv$featureID, collapse = "|"), colnames(otu))
# run RUVs
ruv <- RUVs(t(otu), nc, y, rpl)
}
)
# assign names to list elements
names(ruv) <- paste0("ruv_k", prmt$k, "_nc", prmt$prp)
ComBat-seq is a batch effect correction method for RNASeq data. It is an improved model based on the popular effect correction tool, ComBat.ComBat-seq takes raw count matrix as input. Same as ComBat, it requires a known batch variable.
# PCR batch
PCRBatch <- mtd$PCRBatch
names(PCRBatch) <- rownames(mtd)
# model matrix
mod_mtrx <- model.matrix( ~ SampleType)
# run CombatSeq
cmbt <- ComBat_seq(t(otu), batch = PCRBatch, covar_mod = mod_mtrx)
## Found 3 batches
## Using null model in ComBat-seq.
## Adjusting for 9 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data
Extract and log ratio transform batch effect corrected OTU table.
ruv_clr <- purrr::map(ruv, ~pluck(.x, "normalizedCounts")) %>% # get batch corrected count table
purrr::map(~t(.x)) %>% # transpose normalized count table
purrr::map(~logratio.transfo(.x, logratio = 'CLR', offset = 1)) %>% # CLR transformation
purrr::map(~as(.x, "matrix"))
Run PCA.
pca_ruv <- purrr::map(ruv_clr, ~pca(.x, ncomp = 3))
Make PCA plots.
The PCA plots with k = 4, 5 or 10 look promising. Let’s look at one of them in 3d dimensions.
# extract data
k4_nc1 <- cbind(mtd, pca_ruv$ruv_k4_nc1$variates$X)
# 3d PCA plot with plot_ly
plot_ly(
x = k4_nc1[, "PC1"], y = k4_nc1[, "PC2"], z = k4_nc1[, "PC3"],
type = "scatter3d", mode = "markers", color = k4_nc1$SampleType,
colors = "Paired") %>% #brewer.pal(n = 12, name = "Paired")
layout(scene = list(
xaxis = list(title = paste0('PC1: ', round(pca_ruv$ruv_k4_nc1$explained_variance[1]*100, 2), "%")),
yaxis = list(title = paste0('PC2: ', round(pca_ruv$ruv_k4_nc1$explained_variance[2]*100, 2), "%")),
zaxis = list(title = paste0('PC3: ', round(pca_ruv$ruv_k4_nc1$explained_variance[3]*100, 2), "%"))
)
)
# centered log-ratio transformation
cmbt_clr <- logratio.transfo(t(cmbt), logratio = 'CLR', offset = 1) %>%
as("matrix")
# PCA
pca_cmbt <- pca(cmbt_clr, ncomp = 3)
# make PCA plot
pcaPlot_cmbt <- cbind(mtd, pca_cmbt$variates$X) %>%
ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
geom_point(size = 2) +
geom_line(aes(group = SampleName), color = "black") +
labs(title = "ComBat-Seq",
color = "Sample type", shape = "PCR batch",
x = paste0("PC1: ", round(100 * pca_cmbt$explained_variance[1], 2), "%"),
y = paste0("PC2: ", round(100 * pca_cmbt$explained_variance[2], 2), "%")) +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
theme_bw()
pcaPlot_cmbt
The partial redundancy analysis (pRDA) is a multivariate method to assess globally the effect of treatments and batch. Here, we run pRDA and get variance explained by treatments and batch.
# combine otu table before and after batch corrections in a list
otus <- c(before = list(otu_clr), ruv_clr, combatseq = list(cmbt_clr))
# merge otu tables and nest by batch correction methods
otus_nst <- purrr::map(otus, as.data.frame) %>%
bind_rows(.id = "batch_correction") %>%
group_nest(batch_correction, .key = "data")
# define experiment design
design <- data.frame(SampleType = rep(SampleType, length(otus)),
PCRBatch = rep(PCRBatch, length(otus)),
batch_correction = rep(names(otus), each = nrow(mtd)))
design_nst <- group_nest(design, batch_correction, .key = "design")
# run pRDA and extract variance explained by treatment and batch effects
rda <- inner_join(otus_nst, design_nst, by = "batch_correction") %>%
mutate(rda_trt = map2(data, design, ~rda(.x ~ PCRBatch + Condition(SampleType), data = .y)),
rda_bat = map2(data, design, ~rda(.x ~ SampleType + Condition(PCRBatch), data = .y)),
var_trt = map_dbl(rda_trt, ~.x$pCCA$tot.chi*100/.x$tot.chi),
var_bat = map_dbl(rda_bat, ~.x$pCCA$tot.chi*100/.x$tot.chi),
batch_correction = factor(batch_correction, levels = names(otus))) %>%
select(batch_correction, var_trt, var_bat) %>%
rename(Method = batch_correction, Treatment = var_trt, Batch = var_bat) %>%
pivot_longer(Treatment:Batch, names_to = "Type", values_to = "var_expl") %>%
mutate(var_expl = round(var_expl, 2))
Make bar plots showing variance explained by treatments and batch.
var_expl <- ggplot(rda, aes(x = fct_rev(Method), y = var_expl, fill = Type)) +
geom_bar(stat = "identity", position = 'dodge', colour = 'black') +
geom_text(aes(Method, var_expl + 5, label = var_expl), size = 3,
position = position_dodge(width = 0.9)) +
coord_flip() +
scale_y_continuous(limits = c(0, 100), expand = expansion(mult = c(0, 0))) +
labs(x = "", y = "Variance explained (%)") +
theme_bw(base_size = 12) +
guides(fill = guide_legend(reverse = TRUE))
var_expl
Based on the PCA plots and partial redundancy analysis (pRDA), we can conclude that the RUVSeq provides better batch effect corrections than the Combat-Seq does. In agreement with previous data, the RUVSeq is robust to the choice of negative control variables. Increasing the number of factors of unwanted variation, k, removes more unwanted variation. Setting k = 4, 5 or 10 yields very similar results. To avoid over-correcting batch effects, we choose the smallest K value, ie., k = 4 and nc = 1.
Get batch effect corrected OTU table and replace the original OTU table in the phyloseq object.
# extract batch effect corrected OTU table
otu_adj <- ruv$ruv_k4_nc1$normalizedCounts
# replace otu table in the phyloseq object
otu_table(ps) <- otu_table(otu_adj, taxa_are_rows = TRUE)
Make a table containing top 10 most abundant taxa at genus level.
taxa_tab <- Summarize.Taxa(otu_adj, txnm)$Genus %>% Make.Percent()
taxa_tab <- taxa_tab[order(rowMeans(taxa_tab), decreasing = T), ]
Others <- colSums(taxa_tab[11:nrow(taxa_tab), ])
taxa_tab <- rbind(taxa_tab[1:10, ], Others)
Tidy dataframe for making stacked bar plots.
taxa_tab <- as.data.frame(taxa_tab) %>%
rownames_to_column("Taxa") %>%
separate(Taxa, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")) %>%
mutate(Phylum = ifelse(is.na(Phylum)|Phylum == "NA"|grepl("uncultured|Ambiguous|metagenome", Phylum),
Kingdom,
Phylum),
Class = ifelse(is.na(Class)|Class == "NA"|grepl("uncultured|Ambiguous|metagenome", Class),
Phylum,
Class),
Order = ifelse(is.na(Order)|Order == "NA"|grepl("uncultured|Ambiguous|metagenome", Order),
Class,
Order),
Family = ifelse(is.na(Family)|Family == "NA"|grepl("uncultured|Ambiguous|metagenome", Family),
Order,
Family),
Genus = ifelse(is.na(Genus)|Genus == "NA"|grepl("uncultured|Ambiguous|metagenome", Genus),
Family,
Genus)) %>%
select(-Kingdom, -(Class:Family)) %>%
pivot_longer(-c(Phylum, Genus), names_to = "SampleID", values_to = "Abundance") %>%
mutate(Phylum = gsub("p__", "", Phylum),
Genus = gsub("g__", "", Genus),
Genus = factor(Genus, levels = rev(unique(Genus)))) %>%
inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID")
Make stacked bar plots.
# define color scheme
col <- c("grey", brewer.pal(n = 10, name = "Paired"))
# water samples
taxa_bar_water <- filter(taxa_tab, SampleType == "Water") %>%
ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity") +
labs(x = "", y = "") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
guides(fill=guide_legend(ncol=1)) +
facet_wrap(~ SampleOrigin) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
strip.background = element_blank(),
legend.margin = margin(0, 0, 0, 0),
legend.box.margin = margin(5, -10, 5, -8))
# feed samples
taxa_bar_feed <- filter(taxa_tab, SampleType == "Feed") %>%
ggplot(aes(x = Diet, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity", width = 0.5) +
labs(x = "Sample", y = "") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
facet_nested(~ SampleOrigin + Diet, scales = "free_x", nest_line = TRUE) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
strip.background = element_blank(),
legend.position = "none")
# samples from fish fed the REF diet
taxa_bar_ref <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
filter(grepl("REF-", SampleType)) %>%
ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity") +
labs(x = "", y = "Relative abundance (%)") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
theme_bw() +
theme(axis.text.x = element_blank(),
strip.background = element_blank(),
legend.position = "none")
# samples from fish fed the IM diet
taxa_bar_im <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
filter(grepl("IM-", SampleType)) %>%
ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity") +
labs(x = "", y = "") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
strip.background = element_blank(),
legend.position = "none")
# assemble plots
plot_grid(
taxa_bar_ref,
taxa_bar_feed + theme(plot.margin = margin(l = -0.5, unit = "cm")),
taxa_bar_im + theme(plot.margin = margin(l = -0.4, unit = "cm")),
taxa_bar_water + theme(plot.margin = margin(l = -0.3, unit = "cm")),
nrow = 1, align = 'h', axis = "tb",
rel_widths = c(6, 1.5, 6, 4.5))
Compared to the taxa barplot (data/intermediate/qiime2/97otu/taxa-bar-plots-filtered-97otu.qzv) before batch correction , the taxonomic composition of samples is very different after the batch correction by RUVSeq.
Compute alpha diversity indices.
# compute Observed features, Peilou's Evenness and Shannon's index
alph_npd <- alpha(ps, index = c("observed", "evenness_pielou", "diversity_shannon")) %>%
rownames_to_column("SampleID")
# compute Faith's Phylogenetic Diversity
alph_pd <- pd(samp = t(otu_table(ps)), tree = phy_tree(ps), include.root = F) %>%
select(PD) %>%
rownames_to_column("SampleID")
# combine data
alph <- inner_join(alph_npd, alph_pd, by = "SampleID") %>%
inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID") %>%
rename("Observed OTUs" = observed, "Pielou's evenness" = evenness_pielou,
"Shannon's index" = diversity_shannon, "Faith's PD" = PD) %>%
pivot_longer("Observed OTUs":"Faith's PD", names_to = "alph_indx", values_to = "value") %>%
mutate(alph_indx = factor(alph_indx, levels = c("Observed OTUs", "Pielou's evenness",
"Shannon's index", "Faith's PD")))
Alpha diversity of technical replicates.
filter(alph, IsTechnicalReplicate == "yes") %>%
ggplot(aes(x = PCRBatch, y = value)) +
geom_boxplot(aes(color = PCRBatch), width = 0.3) +
geom_point(aes(color = PCRBatch)) +
geom_line(aes(group = SampleName), linetype = "dashed") +
labs(x = "PCR batch", color = "PCR batch") +
facet_wrap(~alph_indx, scales = "free_y") +
scale_fill_brewer(palette = "Dark2") +
theme_bw()
Alpha diversity of REF-PIM and REF-DIM samples amplified in different PCR batches.
filter(alph, SampleType %in% c("REF-PIM", "REF-DIM")) %>%
ggplot(aes(x = SampleType, y = value)) +
geom_boxplot(width = 0.3) +
geom_point(aes(color = PCRBatch)) +
labs(x = "Sample type", color = "PCR batch") +
facet_wrap(~ alph_indx, scales = "free_y") +
scale_fill_brewer(palette = "Dark2") +
theme_bw()
In line with previous observations, unweighted alpha indices such as Observed OTUs and Faith’s PD are more easily influenced by technical variations than weighted alpha indices like Shannon’s index.
OTU table summary.
sampleSum <- colSums(otu_adj) %>% sort()
sampleSum
## AqFl1-134 AqFl1-031 AqFl1-073 AqFl1-087 AqFl1-033 AqFl1-058 AqFl1-083 AqFl1-089
## 703 1689 1696 1874 1906 1924 1960 1969
## AqFl1-069 AqFl1-133 AqFl1-062 AqFl1-059 AqFl1-128 AqFl1-024 AqFl1-091 AqFl1-065
## 1974 1975 2061 2072 2077 2132 2162 2224
## AqFl1-078 AqFl1-053 AqFl1-020 AqFl1-017 AqFl1-124 AqFl1-126 AqFl1-002 AqFl1-094
## 2263 2281 2300 2308 2330 2400 2460 2502
## AqFl1-052 AqFl1-012 AqFl1-061 AqFl1-001 AqFl1-044 AqFl1-004 AqFl1-049 AqFl1-077
## 2509 2525 2554 2606 2615 2618 2639 2684
## AqFl1-009 AqFl1-129 AqFl1-021 AqFl1-025 AqFl1-035 AqFl1-014 AqFl1-047 AqFl1-041
## 2710 2734 2759 2778 2783 2807 2886 2895
## AqFl1-038 AqFl1-055 AqFl1-029 AqFl1-074 AqFl1-007 AqFl1-063 AqFl1-068 AqFl1-045
## 2936 2958 3081 3123 3182 3387 3387 3549
## AqFl1-027 AqFl1-088 AqFl1-072 AqFl1-096 AqFl1-042 AqFl1-015 AqFl1-032 AqFl1-123
## 3741 3900 3906 3915 3982 4054 4113 4411
## AqFl1-057 AqFl1-125 AqFl1-095 AqFl1-080 AqFl1-127 AqFl1-084 AqFl1-081 AqFl1-013
## 4636 4652 4800 4897 4941 4971 5084 5206
## AqFl1-066 AqFl1-056 AqFl1-076 AqFl1-054 AqFl1-040 AqFl1-043 AqFl1-022 AqFl1-130
## 5256 5304 5387 5648 5651 5794 5980 6015
## AqFl1-030 AqFl1-023 AqFl1-008 AqFl1-003 AqFl1-046 AqFl1-086 AqFl1-060 AqFl1-010
## 6062 6175 6343 6356 6800 6828 6852 7435
## AqFl1-064 AqFl1-079 AqFl1-028 AqFl1-011 AqFl1-034 AqFl1-070 AqFl1-039 AqFl1-016
## 7552 7751 7909 8242 8313 8424 8630 8739
## AqFl1-019 AqFl1-050 AqFl1-092 AqFl1-026 AqFl1-051 AqFl1-082 AqFl1-006 AqFl1-093
## 9067 9146 9188 9715 10040 10188 10891 11007
## AqFl1-071 AqFl1-037 AqFl1-075 AqFl1-018 AqFl1-036 AqFl1-090 AqFl1-067 AqFl1-115
## 11121 11739 11916 12071 12602 12605 13519 14355
## AqFl1-005 AqFl1-118 AqFl1-048 AqFl1-119 AqFl1-116 AqFl1-120 AqFl1-121 AqFl1-117
## 14919 16104 17409 31981 35889 37743 38256 42168
## AqFl1-122
## 48477
We lost quite a lot of sequences after the batch effect correction. Here we rarefy OTU table at a sub-sampling depth of 1689 sequences.
ps_rf <- rarefy_even_depth(ps, sample.size = sampleSum[2], rngseed = 1910)
# pcoa analysis
ord_jac <- ordinate(ps_rf, method = "PCoA", distance = "jaccard")
# ordination plot
plot_ordination(ps_rf, ord_jac, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of Jaccard distance after batch correcton by RUVSeq") +
theme_bw()
# pcoa analysis
ord_bc <- ordinate(ps_rf, method = "PCoA", distance = "bray")
# ordination plot
plot_ordination(ps_rf, ord_bc, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of Bray-Curtis distance after batch correcton by RUVSeq") +
theme_bw()
Unweighted UniFrac distance.
# pcoa analysis
ord_uwuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = FALSE)
# ordination plot
plot_ordination(ps_rf, ord_uwuf, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of unweighted UniFrac distance after batch correcton by RUVSeq") +
theme_bw()
Weighted UniFrac distance.
# pcoa analysis
ord_wuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = TRUE)
# ordination plot
plot_ordination(ps_rf, ord_wuf, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of weighted UniFrac distance after batch correcton by RUVSeq") +
theme_bw()
PcoA plots based on various distance metrics show that batch effect is partly removed after the adjustment by RUVSeq.
RUVSeq largely corrected PCR batch effects. However, the batch correction removed a large proportions of sequences in many samples and resulted in very different taxonomic compositions. Hence, we’ll not correct the PCR batch effect but rather analyze the data separately or account for batch effect in the statistical models where applicable.
I’d like to thank Wang and Lê Cao for publishing the code along with their paper “Managing batch effects in microbiome data”. Part of their code is used for the present analysis.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
## system code page: 65001
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 plotly_4.9.3
## [3] ggh4x_0.1.2.1 ggpubr_0.4.0
## [5] cowplot_1.1.1 sva_3.38.0
## [7] genefilter_1.72.0 mgcv_1.8-34
## [9] RUVSeq_1.24.0 edgeR_3.32.0
## [11] limma_3.46.0 EDASeq_2.24.0
## [13] ShortRead_1.48.0 GenomicAlignments_1.26.0
## [15] SummarizedExperiment_1.20.0 MatrixGenerics_1.2.0
## [17] matrixStats_0.58.0 Rsamtools_2.6.0
## [19] GenomicRanges_1.42.0 GenomeInfoDb_1.26.0
## [21] Biostrings_2.58.0 XVector_0.30.0
## [23] IRanges_2.24.0 S4Vectors_0.28.0
## [25] BiocParallel_1.24.1 Biobase_2.50.0
## [27] BiocGenerics_0.36.0 mixOmics_6.14.0
## [29] MASS_7.3-53.1 MicrobeR_0.3.2
## [31] picante_1.8.2 nlme_3.1-149
## [33] vegan_2.5-7 lattice_0.20-41
## [35] permute_0.9-5 ape_5.4-1
## [37] microbiome_1.12.0 phyloseq_1.34.0
## [39] qiime2R_0.99.35 forcats_0.5.1
## [41] stringr_1.4.0 dplyr_1.0.5
## [43] purrr_0.3.4 readr_1.4.0
## [45] tidyr_1.1.3 tibble_3.1.0
## [47] ggplot2_3.3.3 tidyverse_1.3.0
## [49] here_1.0.1 knitr_1.31
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 rtracklayer_1.50.0 R.methodsS3_1.8.1
## [4] bit64_4.0.5 aroma.light_3.20.0 DelayedArray_0.16.0
## [7] R.utils_2.10.1 data.table_1.14.0 rpart_4.1-15
## [10] hwriter_1.3.2 RCurl_1.98-1.2 generics_0.1.0
## [13] GenomicFeatures_1.42.0 RSQLite_2.2.4 bit_4.0.4
## [16] xml2_1.3.2 lubridate_1.7.10 assertthat_0.2.1
## [19] xfun_0.22 hms_1.0.0 jquerylib_0.1.3
## [22] evaluate_0.14 fansi_0.4.2 progress_1.2.2
## [25] dbplyr_2.1.0 readxl_1.3.1 igraph_1.2.6
## [28] DBI_1.1.1 htmlwidgets_1.5.3 rARPACK_0.11-0
## [31] ellipsis_0.3.1 crosstalk_1.1.1 RSpectra_0.16-0
## [34] backports_1.2.1 annotate_1.68.0 biomaRt_2.46.0
## [37] vctrs_0.3.6 abind_1.4-5 cachem_1.0.4
## [40] withr_2.4.1 checkmate_2.0.0 treeio_1.14.0
## [43] prettyunits_1.1.1 cluster_2.1.0 lazyeval_0.2.2
## [46] crayon_1.4.1 ellipse_0.4.2 pkgconfig_2.0.3
## [49] zCompositions_1.3.4 labeling_0.4.2 nnet_7.3-14
## [52] rlang_0.4.10 lifecycle_1.0.0 BiocFileCache_1.14.0
## [55] modelr_0.1.8 cellranger_1.1.0 rprojroot_2.0.2
## [58] Matrix_1.3-2 aplot_0.0.6 phangorn_2.5.5
## [61] carData_3.0-4 Rhdf5lib_1.12.0 reprex_1.0.0
## [64] base64enc_0.1-3 png_0.1-7 viridisLite_0.3.0
## [67] bitops_1.0-6 R.oo_1.24.0 rhdf5filters_1.2.0
## [70] blob_1.2.1 jpeg_0.1-8.1 rstatix_0.7.0
## [73] DECIPHER_2.18.1 ggsignif_0.6.1 rtk_0.2.6.1
## [76] scales_1.1.1 memoise_2.0.0 magrittr_2.0.1
## [79] plyr_1.8.6 zlibbioc_1.36.0 compiler_4.0.3
## [82] philr_1.16.0 cli_2.3.1 ade4_1.7-16
## [85] patchwork_1.1.1 htmlTable_2.1.0 Formula_1.2-4
## [88] tidyselect_1.1.0 stringi_1.5.3 highr_0.8
## [91] yaml_2.2.1 askpass_1.1 locfit_1.5-9.4
## [94] latticeExtra_0.6-29 ggrepel_0.9.1 grid_4.0.3
## [97] sass_0.3.1 fastmatch_1.1-0 tools_4.0.3
## [100] rio_0.5.26 rstudioapi_0.13 foreach_1.5.1
## [103] foreign_0.8-81 gridExtra_2.3 farver_2.1.0
## [106] Rtsne_0.15 digest_0.6.27 rvcheck_0.1.8
## [109] BiocManager_1.30.10 quadprog_1.5-8 Rcpp_1.0.6
## [112] car_3.0-10 broom_0.7.5 httr_1.4.2
## [115] AnnotationDbi_1.52.0 colorspace_2.0-0 rvest_1.0.0
## [118] XML_3.99-0.5 fs_1.5.0 truncnorm_1.0-8
## [121] splines_4.0.3 tidytree_0.3.3 multtest_2.46.0
## [124] xtable_1.8-4 jsonlite_1.7.2 ggtree_2.4.0
## [127] corpcor_1.6.9 R6_2.5.0 Hmisc_4.5-0
## [130] NADA_1.6-1.1 pillar_1.5.1 htmltools_0.5.1.1
## [133] glue_1.4.2 fastmap_1.1.0 DT_0.17
## [136] codetools_0.2-16 utf8_1.2.1 bslib_0.2.4
## [139] curl_4.3 zip_2.1.1 openxlsx_4.2.3
## [142] openssl_1.4.3 survival_3.2-10 rmarkdown_2.7
## [145] biomformat_1.18.0 munsell_0.5.0 rhdf5_2.34.0
## [148] GenomeInfoDbData_1.2.4 iterators_1.0.13 haven_2.3.1
## [151] reshape2_1.4.4 gtable_0.3.0